{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Claude 3 VS ChatGpt-4   测评"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured.partition.pdf import partition_pdf\n",
    "from typing import Any\n",
    "import textwrap\n",
    "\n",
    "from pydantic import BaseModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path =\"E:\\\\langchain_RAG\\\\data\\\\\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name\n",
      "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n",
      "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "raw_pdf_elements = partition_pdf(\n",
    "    filename=path + \"first_page_output.pdf\",\n",
    "    extract_images_in_pdf=False,\n",
    "    infer_table_structure=True,\n",
    "    chunking_strategy=\"by_title\",\n",
    "    max_characters=4000,\n",
    "    new_after_n_chars=3800,\n",
    "    combine_text_under_n_chars=2000,\n",
    "    image_output_dir_path=path,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{\"<class 'unstructured.documents.elements.CompositeElement'>\": 3,\n",
       " \"<class 'unstructured.documents.elements.Table'>\": 2}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a dictionary to store counts of each type\n",
    "category_counts = {}\n",
    "\n",
    "for element in raw_pdf_elements:\n",
    "    category = str(type(element))\n",
    "    if category in category_counts:\n",
    "        category_counts[category] += 1\n",
    "    else:\n",
    "        category_counts[category] = 1\n",
    "\n",
    "# Unique_categories will have unique elements\n",
    "unique_categories = set(category_counts.keys())\n",
    "category_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2\n",
      "3\n"
     ]
    }
   ],
   "source": [
    "class Element(BaseModel):\n",
    "    type: str\n",
    "    text: Any\n",
    "\n",
    "\n",
    "# Categorize by type\n",
    "categorized_elements = []\n",
    "for element in raw_pdf_elements:\n",
    "    if \"unstructured.documents.elements.Table\" in str(type(element)):\n",
    "        categorized_elements.append(Element(type=\"table\", text=str(element)))\n",
    "    elif \"unstructured.documents.elements.CompositeElement\" in str(type(element)):\n",
    "        categorized_elements.append(Element(type=\"text\", text=str(element)))\n",
    "\n",
    "# Tables\n",
    "table_elements = [e for e in categorized_elements if e.type == \"table\"]\n",
    "print(len(table_elements))\n",
    "\n",
    "# Text\n",
    "text_elements = [e for e in categorized_elements if e.type == \"text\"]\n",
    "print(len(text_elements))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "from langchain_openai import ChatOpenAI"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Gpt-4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = getpass()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt\n",
    "prompt_text = \"\"\"您是一名负责总结表格和文本的助理。\\ \n",
    "请对表格或文本块给出简明但精准的总结。 表格和文本内容: {element} \"\"\"\n",
    "prompt = ChatPromptTemplate.from_template(prompt_text)\n",
    "\n",
    "# Summary chain\n",
    "model_gpt4 = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
    "summarize_chain = {\"element\": lambda x: x} | prompt | model_gpt4 | StrOutputParser()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply to text\n",
    "texts = [i.text for i in text_elements]\n",
    "text_summaries_gpt4 = summarize_chain.batch(texts, {\"max_concurrency\": 5})\n",
    "\n",
    "# Apply to tables\n",
    "tables = [i.text for i in table_elements]\n",
    "table_summaries_gpt4 = summarize_chain.batch(tables, {\"max_concurrency\": 5})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============== 这是一份关于英伟达(NVIDIA)的证券研究报告，日期为2023年8月31日。报告中，分析师推荐买入英伟达的股票。报告强\n",
      "调，由于AI的高景气度驱动，英伟达在2024财年第二季度的业绩增长强劲，且持续推出新品以满足市场需求。报告详细列出了英伟\n",
      "达在该季度的财务表现，包括总收入、各业务收入、毛利率和净利润等数据，并对下一季度的财务表现进行了预测。总体来看，英伟达的\n",
      "财务表现和前景被看好。\n",
      "============== 该文本主要讨论了一家全球领先的加速计算芯片公司的市场走势和业务亮点。公司的数据中心业务收入同比增长近两倍，游戏业务环比持\n",
      "续恢复，汽车业务和专业可视化业务也有稳定的表现。公司在AI计算领域的地位得益于其持续推出的新产品，如L40\n",
      "GPU、GH200 Grace Hopper平台和DGX GH200人工智能平台。预计2024-2026财年公司收入和净\n",
      "利润将大幅增长。考虑到公司的产品力和AI带来的强劲需求，给予公司目标价中值582美元，首次覆盖给予“买入”评级。但也需要\n",
      "注意到下游AI应用发展、AI芯片竞争以及宏观与地缘风险等可能的风险因素。\n",
      "============== 这是一份基于公司财报和国信证券经济研究所预测的资料，其中包含了摊薄每股收益的计算，这是按照最新的总股本进行的。同时，阅读\n",
      "这份资料时，需要注意其后的免责声明及其相关内容。\n"
     ]
    }
   ],
   "source": [
    "for text_summariy in text_summaries_gpt4:\n",
    "    print('==============' ,textwrap.fill(text_summariy , width=60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============== 该表格提供了一项投资的详细信息。首次评级为买入，合理估值在565.00 - 600.00美元之间，而收盘价为460美元。\n",
      "总市值和流通市值均为11366亿美元。在过去的52周中，最高价和最低价分别为482美元和140美元。近3个月的日均成交额\n",
      "为271亿美元。\n",
      "============== 这个表格提供了一个公司从2022年到2026年的盈利预测和财务指标。这些指标包括营业收入、调整后归母净利润、调整后每股收\n",
      "益（EPS）、EBIT利润率、净资产收益率（ROE）、市盈率（PE）、EV/EBITDA和市净率（PB）。预测显示，公司\n",
      "的营业收入和调整后归母净利润预计将在这段时间内持续增长。同时，公司的EBIT利润率、净资产收益率和市盈率也预计将有所提高\n",
      "。然而，EV/EBITDA和市净率预计将逐年下降。\n"
     ]
    }
   ],
   "source": [
    "for table_summaries in table_summaries_gpt4:\n",
    "    print('==============' ,textwrap.fill(table_summaries , width=60))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Claude 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_anthropic import ChatAnthropic\n",
    "\n",
    "os.environ[\"ANTHROPIC_API_KEY\"] = getpass()\n",
    "\n",
    "chat_claude_3_opus = ChatAnthropic(temperature=0, model_name=\"claude-3-opus-20240229\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "summarize_chain_claude = {\"element\": lambda x: x} | prompt | chat_claude_3_opus | StrOutputParser()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply to text\n",
    "text_summaries_claude = summarize_chain_claude.batch(texts, {\"max_concurrency\": 5})\n",
    "# Apply to tables\n",
    "table_summaries_claude = summarize_chain_claude.batch(tables, {\"max_concurrency\": 5})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============== 总结:  1. 英伟达2024财年第二季度业绩表现强劲,主要得益于大模型快速发展驱动的高需求景气度。  2.\n",
      "整体收入135.1亿美元,同比增长101%,环比增长88%,超出此前指引。毛利率和净利润也大幅提升。  3. 分业务来看\n",
      ",数据中心业务收入103.2亿美元,同比增长171%,环比增长141%,是主要增长动力。游戏、专业可视化、汽车业务表现各\n",
      "异。  4. 展望2024财年第三季度,公司预计营收160亿美元,毛利率在71.5%-72.5%,营运费用29.5亿美元\n",
      ",显著优于市场预期。  5. 英伟达持续推出新品以满足AI快速发展带来的市场需求。\n",
      "============== 总结:  1. 英伟达是全球加速计算芯片龙头,受益于人工智能与大模型浪潮,高性能AI计算芯片需求旺盛。  2.\n",
      "公司业务表现亮眼:    - 数据中心业务收入同比增长171%,有望进一步放量;    -\n",
      "游戏业务环比持续恢复,五个季度首次实现同比正增长;    - 汽车业务受益于智能驾驶功能渗透,收入同比增长15%;\n",
      "- 专业可视化业务虽同比下降24%,但环比增长28%,持续推出新品。  3. 英伟达持续推出新品如L40\n",
      "GPU、GH200平台等,巩固其在AI计算领域的龙头地位。  4.\n",
      "国信证券预计公司未来三年收入和利润保持高增长,首次覆盖给予\"买入\"评级,目标价582美元。  5.\n",
      "潜在风险包括下游AI应用发展不及预期、竞争加剧以及宏观与地缘风险。\n",
      "============== 根据提供的表格数据,可以总结出以下要点:  1.\n",
      "该公司2021-2024年的营业收入持续增长,年复合增长率约为18.6%。  2.\n",
      "归母净利润也保持稳健增长,2021-2024年的年复合增长率约为22.7%。  3.\n",
      "公司的毛利率和净利率水平较为稳定,毛利率维持在30%左右,净利率在20%左右。  4.\n",
      "摊薄每股收益逐年提高,2024年预计达到1.04元,相比2021年的0.54元有大幅提升。  5. 总的来看,该公司未来\n",
      "几年的营收和利润有望保持较快增长,盈利能力不断增强,成长性良好。但具体投资还需结合公司所处行业前景、竞争优势、估值水平等\n",
      "多方面因素进行综合分析。\n"
     ]
    }
   ],
   "source": [
    "for text_summariy in text_summaries_claude:\n",
    "    print('==============' ,textwrap.fill(text_summariy , width=60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============== 这是一份关于某公司股票的投资评级报告。主要信息包括:  1.\n",
      "给出\"买入\"的首次评级,认为合理估值区间为565至600美元。  2. 目前收盘价为460美元,低于估值区间。  3.\n",
      "总市值和流通市值均为11366亿美元。  4. 52周最高价为482美元,最低价为140美元,目前股价处于区间中位。\n",
      "5. 近3个月日均成交额为271亿美元,成交活跃。\n",
      "综上,该股首次获得\"买入\"评级,当前股价低于合理估值,若看好公司前景,可考虑买入。但投资需谨慎,务必关注市场风险。\n",
      "============== 该表格展示了一家公司2022年至2026年的盈利预测和财务指标。主要信息总结如下:  1.\n",
      "营业收入预计从2022年的269.14亿美元增长到2026年的926.99亿美元,年复合增长率为36.2%。  2.\n",
      "调整后归母净利润预计从2022年的97.52亿美元增长到2026年的469.44亿美元,年复合增长率为48.1%。\n",
      "3. 调整后每股收益(EPS)预计从2022年的3.95美元增长到2026年的19.01美元。  4.\n",
      "EBIT利润率预计从2022年的36.6%提高到2026年的56.4%。  5.\n",
      "净资产收益率(ROE)在2026年预计达到34.9%。  6.\n",
      "市盈率(PE)和EV/EBITDA倍数呈下降趋势,表明公司估值水平逐年下降。  7.\n",
      "市净率(PB)在2026年预计下降至8倍。\n",
      "总的来说,该公司预计在未来几年将保持强劲的收入和利润增长,盈利能力不断提升,同时估值水平逐步回归合理区间。\n"
     ]
    }
   ],
   "source": [
    "for table_summaries in table_summaries_claude:\n",
    "    print('==============' ,textwrap.fill(table_summaries , width=60))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 视觉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "这张图片是一个股票价格图表，显示了特斯拉（Tesla, Inc. - TSLA）在特定一天内的股票价格变动。图表显示时间\n",
      "范围是从早上10点到下午3点30分。股票价格的变化用蓝色折线图表示，垂直轴（右侧）显示的是价格范围，水平轴（底部）显示的\n",
      "时间。图表中还包含了一条红色虚线，这可能表示当天的开盘价或者某个特定的价格水平。  从图表上看，可以观察到股票在开盘后不\n",
      "久价格开始下跌，随后整个跟踪期间价格波动较大，但总体趋势呈下降。截至图表最右端的价格是175.34美元，这可能是当天收盘\n",
      "价或者当前时刻的价格。图表顶部的\"P: N/A\", \"O: N/A\", \"H: N/A\", \"L: N/A\", \"C:\n",
      "N/A\", \"VOL: N/A\"应该代表当天的当前价（Price）、开盘价（Open）、最高价（High）、最低价（Lo\n",
      "w）、收盘价（Close）和成交量（Volume），但似乎没有提供具体的数值（显示为\"Not Available\"）。\n",
      "此外，图表顶部还有一系列的工具选项，比如可以进行比较（Comparison）、设置指标（Indicators）、查看技术\n",
      "分析（Technicals）或者查看公司事件（Corporate Events）。左上角的Yahoo! Finance \n",
      "标志表明这是雅虎财经提供的股票价格图表。图表右上角有分享（Share）和设置（Settings）选项。整个图表的布局和设\n",
      "计显示了专业的金融信息服务风格。\n"
     ]
    }
   ],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI()\n",
    "response = client.chat.completions.create(\n",
    "  model=\"gpt-4-vision-preview\",\n",
    "  messages=[\n",
    "    {\n",
    "      \"role\": \"user\",\n",
    "      \"content\": [\n",
    "        {\n",
    "          \"type\": \"text\",\n",
    "          \"text\": \"描述这张图片\",\n",
    "        },\n",
    "        {\n",
    "          \"type\": \"image_url\",\n",
    "          \"image_url\": {\n",
    "            \"url\": \"https://i.postimg.cc/zGPLQLT6/tesla.png\",\n",
    "          },\n",
    "        },\n",
    "        \n",
    "      ],\n",
    "    }\n",
    "  ],\n",
    "  max_tokens=500,\n",
    ")\n",
    "\n",
    "print(textwrap.fill(response.choices[0].message.content , width=60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "这是一张特斯拉(TSLA)股票在某日当天的股价走势图。横轴显示时间从上午10点到下午3点半,纵轴是股价。图中蓝色的折线图\n",
      "表现了股价的波动情况,整体呈现出先下降后上升的趋势。股价最低点出现在下午1点左右,达到约175.34美元,之后股价逐步回\n",
      "升。图表右上角显示当前股价为176.34美元。整体来看,当日股价波动幅度不算太大,主要在175至181美元之间浮动。\n"
     ]
    }
   ],
   "source": [
    "import anthropic\n",
    "import base64\n",
    "import httpx\n",
    "\n",
    "client = anthropic.Anthropic()\n",
    "\n",
    "image1_url = \"https://i.postimg.cc/zGPLQLT6/tesla.png\"\n",
    "image1_media_type = \"image/png\"\n",
    "image1_data = base64.b64encode(httpx.get(image1_url).content).decode(\"utf-8\")\n",
    "\n",
    "message = client.messages.create(\n",
    "    model=\"claude-3-opus-20240229\",\n",
    "    max_tokens=500,\n",
    "    messages=[\n",
    "        {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": [\n",
    "                {\n",
    "                    \"type\": \"image\",\n",
    "                    \"source\": {\n",
    "                        \"type\": \"base64\",\n",
    "                        \"media_type\": image1_media_type,\n",
    "                        \"data\": image1_data,\n",
    "                    },\n",
    "                },\n",
    "                {\n",
    "                    \"type\": \"text\",\n",
    "                    \"text\": \"描述这张图片\"\n",
    "                }\n",
    "            ],\n",
    "        }\n",
    "    ],\n",
    ")\n",
    "print(textwrap.fill(message.content[0].text , width=60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llangchainhf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
